Market Basket Analysis

Mohit

2025-02-06

Data used is: https://archive.ics.uci.edu/dataset/352/online+retail

Information about dataset : This is a transactional data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail.The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers.

Corelation: This analysis can be performed on Kada per day data generated by CermePOS. Will be used for product placement on shelves.

  1. To calculate best sellers of the day.
  2. To calculate best sellers of each hour, within a hour, and of each weekday.
  3. To find association rules, to know which products are sold together.
  4. Renerate association rule to understand which items should be stacked together and where to place them.
## # A tibble: 6 × 8
##   InvoiceNo StockCode Description                         Quantity InvoiceDate         UnitPrice CustomerID Country       
##   <chr>     <chr>     <chr>                                  <dbl> <dttm>                  <dbl>      <dbl> <chr>         
## 1 536365    85123A    WHITE HANGING HEART T-LIGHT HOLDER         6 2010-12-01 08:26:00      2.55      17850 United Kingdom
## 2 536365    71053     WHITE METAL LANTERN                        6 2010-12-01 08:26:00      3.39      17850 United Kingdom
## 3 536365    84406B    CREAM CUPID HEARTS COAT HANGER             8 2010-12-01 08:26:00      2.75      17850 United Kingdom
## 4 536365    84029G    KNITTED UNION FLAG HOT WATER BOTTLE        6 2010-12-01 08:26:00      3.39      17850 United Kingdom
## 5 536365    84029E    RED WOOLLY HOTTIE WHITE HEART.             6 2010-12-01 08:26:00      3.39      17850 United Kingdom
## 6 536365    22752     SET 7 BABUSHKA NESTING BOXES               2 2010-12-01 08:26:00      7.65      17850 United Kingdom
library(dplyr)

best_selling_products <- data %>%
  group_by(Description) %>%
  summarise(Total_Quantity = sum(Quantity, na.rm = TRUE)) %>%
  arrange(desc(Total_Quantity))

head(best_selling_products, 50)
## # A tibble: 50 × 2
##    Description                        Total_Quantity
##    <chr>                                       <dbl>
##  1 WORLD WAR 2 GLIDERS ASSTD DESIGNS           53847
##  2 JUMBO BAG RED RETROSPOT                     47363
##  3 ASSORTED COLOUR BIRD ORNAMENT               36381
##  4 POPCORN HOLDER                              36334
##  5 PACK OF 72 RETROSPOT CAKE CASES             36039
##  6 WHITE HANGING HEART T-LIGHT HOLDER          35317
##  7 RABBIT NIGHT LIGHT                          30680
##  8 MINI PAINT SET VINTAGE                      26437
##  9 PACK OF 12 LONDON TISSUES                   26315
## 10 PACK OF 60 PINK PAISLEY CAKE CASES          24753
## # ℹ 40 more rows
ggplot(best_selling_products[1:10, ], aes(x = reorder(Description, Total_Quantity), y = Total_Quantity)) +
    #geom_point()+
    geom_bar(stat = "identity", fill = "#9c9797", colour ="#0cf00c") +
    coord_flip() +
    labs(title = "Top 20 Best Selling Products", x = "Product", y = "Total Quantity Sold")   

p <- ggplot(best_selling_products[1:10, ], aes(x = factor(Description), y = Total_Quantity, fill = Description)) +
    geom_bar(stat = "identity") +
    labs(title = "Top 10 Best Selling Products",
         x = "Product",
         y = "Total Quantity Sold",
         fill = "Product") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Convert to Plotly
ggplotly(p)
library(lubridate)
library(dplyr)

best_selling_time <- data %>%
  mutate(Hour = hour(InvoiceDate)) %>%
  group_by(Hour, Description) %>%
  summarise(Total_Quantity = sum(Quantity, na.rm = TRUE), .groups = "drop") %>%
  filter(Total_Quantity > 0) %>%
  arrange(desc(Total_Quantity))

head(best_selling_time, 50)
## # A tibble: 50 × 3
##     Hour Description                        Total_Quantity
##    <int> <chr>                                       <dbl>
##  1    15 ASSTD DESIGN 3D PAPER STICKERS              12677
##  2    12 WORLD WAR 2 GLIDERS ASSTD DESIGNS           12213
##  3    10 WORLD WAR 2 GLIDERS ASSTD DESIGNS            8647
##  4    12 JUMBO BAG RED RETROSPOT                      7567
##  5    11 JUMBO BAG RED RETROSPOT                      7539
##  6    10 SMALL POPCORN HOLDER                         7334
##  7    12 PACK OF 72 RETROSPOT CAKE CASES              7191
##  8    10 JUMBO BAG RED RETROSPOT                      7080
##  9    13 WORLD WAR 2 GLIDERS ASSTD DESIGNS            7056
## 10    12 WHITE HANGING HEART T-LIGHT HOLDER           7003
## # ℹ 40 more rows
tail(best_selling_time, 50)
## # A tibble: 50 × 3
##     Hour Description                      Total_Quantity
##    <int> <chr>                                     <dbl>
##  1    20 NO SINGING METAL SIGN                         1
##  2    20 NUMBER TILE COTTAGE GARDEN No                 1
##  3    20 OPEN CLOSED METAL SIGN                        1
##  4    20 OVAL WALL MIRROR DIAMANTE                     1
##  5    20 PACK OF 20 NAPKINS PANTRY DESIGN              1
##  6    20 PACK OF 20 SPACEBOY NAPKINS                   1
##  7    20 PACK OF 60 MUSHROOM CAKE CASES                1
##  8    20 PING! MICROWAVE PLATE                         1
##  9    20 PINK PAISLEY SQUARE TISSUE BOX                1
## 10    20 PINK PAPER PARASOL                            1
## # ℹ 40 more rows
range(best_selling_time$Hour)
## [1]  6 20
# Get the best-selling product for each hour
best_selling_per_hour <- best_selling_time %>%
  group_by(Hour) %>%
  slice_max(order_by = Total_Quantity, n = 1) # Selects the top product per hour

head(best_selling_per_hour, 50)
## # A tibble: 15 × 3
## # Groups:   Hour [15]
##     Hour Description                        Total_Quantity
##    <int> <chr>                                       <dbl>
##  1     6 DOG BOWL VINTAGE CREAM                          1
##  2     7 RABBIT NIGHT LIGHT                            912
##  3     8 WHITE HANGING HEART T-LIGHT HOLDER           2553
##  4     9 PACK OF 72 RETROSPOT CAKE CASES              5247
##  5    10 WORLD WAR 2 GLIDERS ASSTD DESIGNS            8647
##  6    11 JUMBO BAG RED RETROSPOT                      7539
##  7    12 WORLD WAR 2 GLIDERS ASSTD DESIGNS           12213
##  8    13 WORLD WAR 2 GLIDERS ASSTD DESIGNS            7056
##  9    14 JUMBO BAG RED RETROSPOT                      4750
## 10    15 ASSTD DESIGN 3D PAPER STICKERS              12677
## 11    16 JUMBO BAG RED RETROSPOT                      3504
## 12    17 POPCORN HOLDER                               3386
## 13    18 POPCORN HOLDER                               4811
## 14    19 LETTER SHAPE PENCIL SHARPENER                1600
## 15    20 HOMEMADE JAM SCENTED CANDLES                  696
# Plot
b <- ggplot(best_selling_per_hour, aes(x = factor(Hour), y = Total_Quantity, fill = Description)) +
  geom_bar(stat = "identity") +
  labs(title = "Best Selling Product for Each Hour",
       x = "Hour of the Day",
       y = "Total Quantity Sold",
       fill = "Product") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplotly(b)
# Get the top 5 best-selling products for each hour
top_5_per_hour <- best_selling_time %>%
  group_by(Hour) %>%
  slice_max(order_by = Total_Quantity, n = 5) %>%
  ungroup()

head(top_5_per_hour, 50)
## # A tibble: 50 × 3
##     Hour Description                        Total_Quantity
##    <int> <chr>                                       <dbl>
##  1     6 DOG BOWL VINTAGE CREAM                          1
##  2     7 RABBIT NIGHT LIGHT                            912
##  3     7 JUMBO BAG PINK POLKADOT                       400
##  4     7 SOMBRERO                                      400
##  5     7 HOT WATER BOTTLE KEEP CALM                    360
##  6     7 MINI PAINT SET VINTAGE                        360
##  7     8 WHITE HANGING HEART T-LIGHT HOLDER           2553
##  8     8 JUMBO BAG RED RETROSPOT                      1880
##  9     8 HEART OF WICKER SMALL                        1744
## 10     8 MINI PAINT SET VINTAGE                       1584
## # ℹ 40 more rows
# Plot
a <- ggplot(top_5_per_hour, aes(x = reorder(Description, Total_Quantity), 
                           y = Total_Quantity, fill = Description)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  facet_wrap(~ Hour, scales = "free_y") +  # Creates a grid of plots by Hour
  coord_flip() +  # Flips bars for better readability
  labs(title = "Top 5 Best-Selling Products for Each Hour",
       x = "Product",
       y = "Total Quantity Sold") +
  theme_minimal()

ggplotly(a)
library(dplyr)
library(ggplot2)
library(plotly)
library(lubridate)

# Ensure Quantity is numeric
data <- data %>%
  mutate(Quantity = as.numeric(Quantity)) 

# Best selling products based on the day of the week
best_selling_day <- data %>%
  mutate(Weekday = weekdays(InvoiceDate)) %>%
  group_by(Weekday, Description) %>%
  summarise(Total_Quantity = sum(Quantity, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(Total_Quantity))

# Check results
head(best_selling_day, 10)
## # A tibble: 10 × 3
##    Weekday   Description                        Total_Quantity
##    <chr>     <chr>                                       <dbl>
##  1 Thursday  WORLD WAR 2 GLIDERS ASSTD DESIGNS           18051
##  2 Friday    ASSTD DESIGN 3D PAPER STICKERS              12793
##  3 Thursday  ASSORTED COLOUR BIRD ORNAMENT               11409
##  4 Thursday  JUMBO BAG RED RETROSPOT                     11283
##  5 Wednesday WORLD WAR 2 GLIDERS ASSTD DESIGNS           10315
##  6 Tuesday   WHITE HANGING HEART T-LIGHT HOLDER           9954
##  7 Wednesday JUMBO BAG RED RETROSPOT                      9934
##  8 Thursday  MINI PAINT SET VINTAGE                       9348
##  9 Wednesday BROCADE RING PURSE                           9154
## 10 Tuesday   JUMBO BAG RED RETROSPOT                      8970
# Visualization
c <- ggplot(best_selling_day, aes(x = factor(Weekday, levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")), 
                                  y = Total_Quantity, fill = Description)) +
  geom_bar(stat = "identity") +
  labs(title = "Best Selling Product for Each Weekday",
       x = "Weekday",
       y = "Total Quantity Sold",
       fill = "Product") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Convert to interactive plot
ggplotly(c)
library(arules)
library(arulesViz)
library(dplyr)
library(ggplot2)

# Convert data to transactions format
transactions <- as(split(data$Description, data$InvoiceNo), "transactions")
## Warning in asMethod(object): removing duplicated items in transactions
# Apply Apriori Algorithm
rules <- apriori(transactions, parameter = list(support = 0.01, confidence = 0.3))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen maxlen target  ext
##         0.3    0.1    1 none FALSE            TRUE       5    0.01      1     10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 259 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4211 item(s), 25900 transaction(s)] done [0.10s].
## sorting and recoding items ... [590 item(s)] done [0.01s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 done [0.02s].
## writing ... [681 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Convert rules to a tidy data frame
rules_df <- DATAFRAME(rules, separate = TRUE)  # Explicit conversion

# Extract only the top 10 rules based on lift
top_rules <- rules_df %>%
  arrange(desc(lift)) %>%
  head(10)

# Plot using ggplot2
d <- ggplot(top_rules, aes(x = reorder(LHS, lift), y = lift, fill = RHS)) +
  geom_bar(stat = "identity") +
  coord_flip() +  # Flip for better readability
  labs(title = "Top 10 Association Rules by Lift",
       x = "LHS (Antecedent)",
       y = "Lift",
       fill = "RHS (Consequent)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplotly(d)